/*LIS Cross-National Data Center in Luxembourg*/

/*email: usersupport@lisdatacenter.org*/

/*LIS Self Teaching Package 2022*/
/*Part I: Inequality, poverty, and social policy*/
/*SAS version*/

/*last change of this version of the syntax: 15-01-2022*/

/*The exercises in Part I demonstrate the use of household income data along with 
useful programming techniques for working with the LIS data. With a focus on 
descriptive statistics, the exercises will lead you through the process of developing
a complete program that examines inequality and poverty across countries.*/


/*Exercise 1: Accessing the LIS databases*/

/* Session options for this job (log and listing layout). */
options nofmterr nonotes nosource nodate nocenter label nonumber
        ls=200 ps=max;

/* Summary statistics of dhi, read from the data set that the macro
   variable gt06h resolves to. */
proc means data=&gt06h;
    var dhi;
run;


/*Exercise 2: Sample selection and weighting*/

/* Session options; the empty TITLE clears any title set earlier. */
options nofmterr nonotes nosource nodate nonumber nocenter label ls=max ps=max;
title "";

/* Working file: keep the income components, flag records where at least
   one of the tested components is missing, and build the person weight. */
data current;
    set &gt06h (keep=dhi hifactor hpub_i hpub_u hpub_a hiprivate hxitsc
                     hpopwgt nhhmem grossnet);
    /* 1 if any tested component equals ".", otherwise 0 */
    miss_comp = (dhi = . or hpub_i = . or hpub_u = . or hpub_a = .
                 or hiprivate = . or hxitsc = .);
    /* household weight multiplied by the number of household members */
    ipwgt = hpopwgt * nhhmem;
run;

/* --- unweighted description of the full file --- */
title "Unweighted results, all cases";
proc freq data=current;
    tables miss_comp;
run;
proc means data=current n mean median min max;
    var dhi hifactor hpub_i hpub_u hpub_a hiprivate hxitsc hpopwgt;
run;
proc freq data=current;
    tables grossnet;
run;

/* --- same statistics, weighted by ipwgt --- */
title "Weighted results, all cases";
proc means data=current n mean median min max;
    weight ipwgt;
    var dhi hifactor hpub_i hpub_u hpub_a hiprivate hxitsc hpopwgt;
run;

/* --- weighted, restricted to records without missing components --- */
title "Weighted results, missing income cases dropped";
proc means data=current n mean median min max;
    where miss_comp = 0;
    weight ipwgt;
    var dhi hifactor hpub_i hpub_u hpub_a hiprivate hxitsc hpopwgt;
run;

/* Frequency of the currency variable in the source file
   (printed under the title set just above). */
proc freq data=&gt06h;
    tables currency;
run;


/*Exercise 3: Working with household income variables (top and bottom coding and equivalence scales)*/

/* Session options; the empty TITLE clears any title set earlier. */
options nofmterr nonotes nosource nodate nonumber nocenter label ls=max ps=max;
title "";

/* Working file restricted to records where none of the tested income
   components is missing. */
data current;
    set &gt06h (keep=dhi hifactor hpub_i hpub_u hpub_a hiprivate hxitsc
                     hpopwgt nhhmem grossnet did);
    /* 1 if any tested component equals ".", otherwise 0 */
    miss_comp = (dhi = . or hpub_i = . or hpub_a = . or hpub_u = .
                 or hiprivate = . or hxitsc = .);
    /* flagged records are dropped; the statements below run only for kept ones */
    if miss_comp = 1 then delete;
    /* household weight multiplied by the number of household members */
    ipwgt = hpopwgt * nhhmem;
run;

TITLE "Top/bottom-coding" ;

/* Step 1: create dhitb (the income variable that gets top/bottom-coded)
   and dhilog (its natural logarithm). */
DATA current ;
 SET current ;
	dhitb  = dhi ;
	/* Bottom and top coding / outlier detection */
	/* Select only records where DHI is non-missing */
	IF dhitb=. THEN DELETE;
	/* Recode negative DHI into zero.
	   Block comments are used on purpose: a "*" comment ends only at the
	   next semicolon, so a "*" comment without its own semicolon turns the
	   following statement into comment text. */
	IF (dhi<0)  THEN dhitb=0;
	dhilog=log(dhitb);
	/* log() yields a missing value for zero income; those records are kept
	   in the distribution of non-missing dhi with a log value of 0 */
	IF( (dhilog=.)  AND (dhitb^=.) ) THEN dhilog=0;
RUN;

/* Step 2: sort by did and log income before the quartiles are computed */
PROC SORT DATA=current ;
  BY did dhilog;
RUN ;

/* Step 3: weighted 25th and 75th percentile of log income */
PROC UNIVARIATE DATA=current NOPRINT;
  VAR dhilog ;
  WEIGHT hpopwgt;
  OUTPUT OUT=temp P25=q25 P75=q75;
RUN ;

/* Step 4: pass the quartiles on as macro variables &b (P25) and &t (P75) */
DATA _NULL_;
  SET temp;
  CALL SYMPUT("b",q25);
  CALL SYMPUT("t",q75);
RUN;

/* Step 5: apply the bounds */
DATA current ;
 SET current ;
	/* interquartile range on the log scale */
	iqr=&t-&b;
	/* bounds for extreme values: three IQRs beyond each quartile */
	upper_bound=&t + (iqr * 3) ;
	lower_bound=&b - (iqr * 3);
	/* top code income at the upper bound, bottom code at the lower bound;
	   exp() converts the log-scale bounds back to the income scale */
	IF dhitb>exp(upper_bound) THEN dhitb=exp(upper_bound) ;
	IF dhitb<exp(lower_bound) THEN dhitb=exp(lower_bound);
RUN ;

/* Weighted summary of household income before (dhi) and after (dhitb)
   top/bottom-coding.
   NOTE(review): the title text speaks of per-capita and equivalised income,
   while the variables listed here are dhi and dhitb - confirm the wording. */
title "Income per Capita and Equivalized Income before top/bottom-coding";
proc means data=current mean median min max;
    weight hpopwgt;
    var dhi dhitb;
run;

/* Derive the two adjusted income measures from the top/bottom-coded income:
   edhi  - divided by the square root of household size
   dhipc - divided by household size */
data current;
    set current;
    edhi  = dhitb / sqrt(nhhmem);
    dhipc = dhitb / nhhmem;
run;

/* Person-weighted summary of both measures */
title "Income per Capita and Equivalized Income after top/bottom-coding";
proc means data=current mean median min max;
    weight ipwgt;
    var dhipc edhi;
run;


/*Exercise 4: Inequality: The Gini Index*/

OPTIONS NOFMTERR NONOTES NOSOURCE NODATE NONUMBER NOCENTER LABEL LS=MAX PS=MAX;

/*
 Macro Gini: weighted Gini coefficient of one income variable.

 The macro has no parameters. It reads three macro variables that the
 caller sets before invoking %gini:
   dataset - data set to analyse (it is sorted in place by &var)
   var     - income variable
   wgt     - weight variable

 Result: data set Gini with one observation and one variable (gini),
 which is printed through PROC MEANS.
*/
%MACRO Gini ;
	/* cumulative shares are built in ascending order of income */
	PROC SORT DATA=&dataset ;
	  BY &var ;
	RUN ;
	DATA Gini (KEEP=gini) ;
	    /* First pass, executed once: total weight (swt) and total
	       weighted income (swtey) over the whole data set */
	    IF _N_ = 1 THEN
	        DO UNTIL (last) ;
	            SET &dataset END=last;
				swt + &wgt ;
				swtey + (&wgt*&var) ;
	        END ;
	    /* Second pass over the SAME data set as the first pass
	       (&dataset, not a fixed data set name) */
	    SET &dataset END=eof;
	        IF _N_ = 1 THEN
	            DO ;
	                prewt = 0 ;
					preey = 0 ;
					up    = 0 ;
					sum   = 0 ;
	            END ;
	        /* running totals and cumulative shares in percent */
	        cwt + &wgt ;
		   	cwtey + (&var*&wgt);
		   	pcwt   = cwt / swt * 100;
		   	pcwtey = cwtey / swtey * 100;
		   	/* width of the weight step times the sum of the current and
		   	   previous cumulative income share */
		   	up     = (pcwt-prewt) * (pcwtey+preey) ;
		   	sum + up ;
		   	prewt = pcwt ;
	       	preey = pcwtey ;

		   	RETAIN prewt preey ;
			/* both shares are in percent, hence the division by 100*100 */
			IF eof THEN
	            DO ;
	               gini=1-(sum / 10000) ;
	               OUTPUT ;
	            END ;
	RUN;
	PROC MEANS DATA=Gini MEAN ;
	RUN;
%MEND Gini ;
title "";

/* Working file restricted to records where none of the tested income
   components is missing. */
data current;
    set &gt06h (keep=dhi hifactor hpub_i hpub_u hpub_a hiprivate hxitsc
                     hpopwgt nhhmem grossnet did);
    miss_comp = (dhi = . or hpub_i = . or hpub_a = . or hpub_u = .
                 or hiprivate = . or hxitsc = .);
    if miss_comp = 1 then delete;
    ipwgt = hpopwgt * nhhmem;
run;

/* dhitb: income to be top/bottom-coded (negatives set to 0);
   dhilog: its log, set to 0 where log() returns missing. */
data current;
    set current;
    dhitb = dhi;
    if dhitb = . then delete;
    if dhi < 0 then dhitb = 0;
    dhilog = log(dhitb);
    if (dhilog = .) and (dhitb ne .) then dhilog = 0;
run;

proc sort data=current;
    by did dhilog;
run;

/* weighted quartiles of log income, handed over as &b (P25) and &t (P75) */
proc univariate data=current noprint;
    var dhilog;
    weight hpopwgt;
    output out=temp p25=q25 p75=q75;
run;
data _null_;
    set temp;
    call symput("b", q25);
    call symput("t", q75);
run;

/* Apply the bounds (three IQRs beyond the quartiles, log scale) and then
   derive per-capita and equivalised income from the coded variable. */
data current;
    set current;
    iqr         = &t-&b;
    upper_bound = &t + (iqr * 3);
    lower_bound = &b - (iqr * 3);
    if dhitb > exp(upper_bound) then dhitb = exp(upper_bound);
    if dhitb < exp(lower_bound) then dhitb = exp(lower_bound);
    dhipc = dhitb / nhhmem;
    edhi  = dhitb / sqrt(nhhmem);
run;

/* Gini coefficient for three income concepts; macro Gini reads the
   macro variables dataset, var and wgt. */
%let dataset = current;

title "Household income";
%let var = dhitb;
%let wgt = hpopwgt;
%gini

title "Income per Capita";
%let var = dhipc;
%let wgt = ipwgt;
%gini

title "Equivalised income";
%let var = edhi;
%let wgt = ipwgt;
%gini


/*Exercise 5: Relative poverty rates*/

OPTIONS NOFMTERR NONOTES NOSOURCE NODATE NONUMBER NOCENTER LABEL LS=MAX PS=MAX;
TITLE "";

/* Working file restricted to records where none of the tested income
   components is missing; ipwgt is the household weight multiplied by
   the number of household members. */
DATA current ;
 SET &gt06h (KEEP=dhi hifactor hpub_i hpub_u hpub_a hiprivate hxitsc hpopwgt nhhmem grossnet did) ;
	miss_comp = 0 ;
	IF 	(dhi=. | hpub_i=. | hpub_a=. | hpub_u=. | hiprivate=. | hxitsc=.) THEN miss_comp = 1 ;
	IF miss_comp = 1 THEN DELETE ;
	ipwgt =  hpopwgt*nhhmem ;
RUN ;

/* dhitb: income to be top/bottom-coded (negative values set to zero);
   dhilog: its log, set to 0 where log() returns a missing value. */
DATA current ;
 SET current ;
	dhitb  = dhi ;
	IF dhitb=. THEN DELETE;
	IF (dhi<0)  THEN dhitb=0;
	dhilog=log(dhitb);
	IF( (dhilog=.)  AND (dhitb^=.) ) THEN dhilog=0;
RUN;
PROC SORT DATA=current ;
  BY did dhilog;
RUN ;

/* Weighted quartiles of log income, passed on as &b (P25) and &t (P75) */
PROC UNIVARIATE DATA=current NOPRINT;
  VAR dhilog ;
  WEIGHT hpopwgt;
  OUTPUT OUT=temp P25=q25 P75=q75;
RUN ;
DATA _NULL_;
  SET temp;
  CALL SYMPUT("b",q25);
  CALL SYMPUT("t",q75);
RUN;

/* Top/bottom-code at three IQRs beyond the quartiles (log scale) and
   equivalise with the square root of household size. */
DATA current ;
 SET current ;
	iqr=&t-&b;
	upper_bound=&t + (iqr * 3) ;
	lower_bound=&b - (iqr * 3);
	IF dhitb>exp(upper_bound) THEN dhitb=exp(upper_bound) ;
	IF dhitb<exp(lower_bound) THEN dhitb=exp(lower_bound);
	edhi  = dhitb / SQRT(nhhmem) ;
RUN ;

/* Person-weighted median of equivalised income, passed on as &m */
PROC MEANS DATA=current NOPRINT;
  VAR edhi ;
  WEIGHT ipwgt ;
  OUTPUT OUT=temp MEDIAN=mededhi;
RUN ;
DATA _NULL_;
 SET temp;
   CALL SYMPUT("m",mededhi);
RUN;

/* Poverty line at half the median; poor = 1 when equivalised income is
   below the line, otherwise 0. */
DATA current ;
 SET current ;
	povlin = &m * 0.5 ;
	poor   = 0 ;
	IF edhi < povlin THEN poor = 1 ;
RUN ;

TITLE "Relative poverty rate - Household level";
PROC FREQ DATA=current;
   TABLES poor ;
   WEIGHT hpopwgt;
RUN  ;
TITLE "Relative poverty rate - Individual level";
PROC FREQ DATA=current;
   TABLES poor ;
   WEIGHT ipwgt;
RUN ;


/*Exercise 6: Comparing income concepts*/

OPTIONS NONOTES NOSOURCE NOFMTERR NODATE NONUMBER NOCENTER LABEL LS=MAX PS=MAX;
TITLE "";
/**-----------------------------**/
/** PART I: DEFINE SUB-ROUTINES **/ 
/**-----------------------------**/

/*
 Macro Gini: weighted Gini coefficient of one income variable.

 The macro has no parameters. It reads three macro variables that the
 caller sets before invoking %gini:
   dataset - data set to analyse (it is sorted in place by &var)
   var     - income variable
   wgt     - weight variable

 Result: data set Gini with one observation and one variable (gini),
 which is printed through PROC MEANS.
*/
%MACRO Gini ;
	/* cumulative shares are built in ascending order of income */
	PROC SORT DATA=&dataset ;
	  BY &var ;
	RUN ;
	DATA Gini (KEEP=gini) ;
	    /* First pass, executed once: total weight (swt) and total
	       weighted income (swtey) over the whole data set */
	    IF _N_ = 1 THEN
	        DO UNTIL (last) ;
	            SET &dataset END=last;
				swt + &wgt ;
				swtey + (&wgt*&var) ;
	        END ;
	    /* Second pass over the SAME data set as the first pass
	       (&dataset, not a fixed data set name) */
	    SET &dataset END=eof;
	        IF _N_ = 1 THEN
	            DO ;
	                prewt = 0 ;
					preey = 0 ;
					up    = 0 ;
					sum   = 0 ;
	            END ;
	        /* running totals and cumulative shares in percent */
	        cwt + &wgt ;
		   	cwtey + (&var*&wgt);
		   	pcwt   = cwt / swt * 100;
		   	pcwtey = cwtey / swtey * 100;
		   	/* width of the weight step times the sum of the current and
		   	   previous cumulative income share */
		   	up     = (pcwt-prewt) * (pcwtey+preey) ;
		   	sum + up ;
		   	prewt = pcwt ;
	       	preey = pcwtey ;

		   	RETAIN prewt preey ;
			/* both shares are in percent, hence the division by 100*100 */
			IF eof THEN
	            DO ;
	               gini=1-(sum / 10000) ;
	               OUTPUT ;
	            END ;
	RUN;
	PROC MEANS DATA=Gini MEAN ;
	RUN;
%MEND Gini ;
/*
 Macro equival: top/bottom-code and equivalise a list of income variables
 in data set current.

 Input : macro variable incTyp - blank-separated list of variable names.
         The list may have any length; the loop ends at the first empty
         word returned by %SCAN.
 Needs : variables did, hpopwgt and nhhmem in data set current.
 Output: for every listed variable X, data set current gains
           eX   - X with negatives set to 0, bounded at three IQRs beyond
                  the weighted quartiles of log income, then divided by
                  the square root of nhhmem
           logX - log of the zero-floored X (0 where log() is missing)
         Records where X is missing are deleted from current.
*/
%MACRO equival ;
 %LOCAL i tmpv ;
 %LET i = 1 ;
 %DO %WHILE (%LENGTH(%SCAN(&incTyp,&i)) > 0) ;
	%LET tmpv  = %SCAN(&incTyp,&i) ;
	/* zero-floored copy and its log */
	DATA current ;
	 SET current ;
		e&tmpv  = &tmpv ;
		IF e&tmpv=. THEN DELETE;
		IF (&tmpv<0)  THEN e&tmpv=0;
		log&tmpv=log(e&tmpv);
		IF( (log&tmpv=.)  AND (e&tmpv^=.) ) THEN log&tmpv=0;
	RUN;
	PROC SORT DATA=current ;
	  BY did log&tmpv;
	RUN ;
	/* weighted quartiles of the log variable, passed on as &b and &t */
	PROC UNIVARIATE DATA=current NOPRINT;
	  VAR log&tmpv ;
	  WEIGHT hpopwgt;
	  OUTPUT OUT=temp P25=q25 P75=q75;
	RUN ;
	DATA _NULL_;
	  SET temp;
	  CALL SYMPUT("b",q25);
	  CALL SYMPUT("t",q75);
	RUN;
	/* apply the bounds, then equivalise */
	DATA current ;
	 SET current ;
		iqr=&t-&b;
		upper_bound=&t + (iqr * 3) ;
		lower_bound=&b - (iqr * 3);
		IF e&tmpv>exp(upper_bound) THEN e&tmpv=exp(upper_bound) ;
		IF e&tmpv<exp(lower_bound) THEN e&tmpv=exp(lower_bound);
		e&tmpv = e&tmpv	/ SQRT(nhhmem) ;
	RUN ;
	%LET i = %EVAL(&i+1) ;
 %END ;
%MEND equival ;

/*
 Macro indic: Gini coefficient and poverty rate for a list of income
 variables in data set current.

 Input : macro variable incTyp - blank-separated list of variable names.
         The list may have any length; the loop ends at the first empty
         word returned by %SCAN.
 Needs : macro Gini, and variables ipwgt and povlin in data set current.
 Output: printed Gini (via %gini) and a PROC FREQ table of poor<j> for the
         j-th listed variable; poor<j> (1 if the variable is below povlin,
         otherwise 0) is added to current.
*/
%MACRO indic ;
 %LOCAL j etmpv ;
 %LET j = 1 ;
 %DO %WHILE (%LENGTH(%SCAN(&incTyp,&j)) > 0) ;
	%LET etmpv = %SCAN(&incTyp,&j) ;
	/* macro Gini reads dataset, var and wgt */
	%LET dataset = current;
	%LET var     = &etmpv ;
	%LET wgt     = ipwgt  ;
	%gini
	DATA current ;
	 SET current ;
	 	poor&j = 0 ;
		IF &etmpv < povlin THEN poor&j = 1 ;
	RUN ;
	PROC FREQ DATA=current;
	   TABLES poor&j ;
	   WEIGHT ipwgt;
	RUN ;
	%LET j = %EVAL(&j+1) ;
 %END ;
%MEND indic ;

/**-----------------------------------**/
/**  PART II: RUN THE MAIN PROGRAMME  **/ 
/**-----------------------------------**/

/* Working file: drop records where any tested income component is
   missing, build the person weight, and derive the income concepts
     dhitb - copy of dhi
     mi    - hifactor + hiprivate + hi33
     siti  - mi + hpub_i + hpub_u - hxitsc
     sa    - mi + hpub_a
   (sum() is used for every concept, as in each assignment below). */
data current;
    set &gt06h (keep=dhi hifactor hpublic hpub_i hpub_u hpub_a hi33 hiprivate
                     hxitsc hpopwgt nhhmem grossnet did);
    miss_comp = (dhi = . or hpub_i = . or hpub_u = . or hpub_a = .
                 or hi33 = . or hiprivate = . or hxitsc = .);
    if miss_comp = 1 then delete;
    ipwgt = hpopwgt * nhhmem;
    dhitb = dhi;
    mi    = sum(hifactor, hiprivate, hi33);
    siti  = sum(hifactor, hiprivate, hi33, hpub_i, hpub_u, -hxitsc);
    sa    = sum(hifactor, hiprivate, hi33, hpub_a);
run;

/* Summary of the public transfer variables in the source file */
proc means data=&gt06h;
    var hpublic hpub_i hpub_u hpub_a;
run;

/* Top/bottom-code and equivalise all four concepts (creates emi, esiti,
   esa and edhitb). */
%let incTyp = mi siti sa dhitb;
%equival

/* Poverty line: half the person-weighted median of edhitb */
proc means data=current noprint;
    var edhitb;
    weight ipwgt;
    output out=temp median=mededhi;
run;
data _null_;
    set temp;
    call symput("m", mededhi);
run;
data current;
    set current;
    povlin = &m * 0.5;
run;

/* Gini and poverty rate for each equivalised concept */
%let incTyp = emi esiti esa edhitb;
%indic


/*Exercise 7: Comparing multiple countries*/

OPTIONS NONOTES NOSOURCE NOFMTERR NODATE NONUMBER NOCENTER LABEL LS=MAX PS=MAX ;
TITLE "";
/**-----------------------------**/
/** PART I: DEFINE SUB-PROGRAMS **/ 
/**-----------------------------**/

/*
 Macro equival: top/bottom-code and equivalise a list of income variables
 in data set current.

 Input : macro variable incTyp - blank-separated list of variable names.
         The list may have any length; the loop ends at the first empty
         word returned by %SCAN.
 Needs : variables did, hwgt and nhhmem in data set current.
 Output: for every listed variable X, data set current gains
           eX   - X with negatives set to 0, bounded at three IQRs beyond
                  the weighted quartiles of log income, then divided by
                  the square root of nhhmem
           logX - log of the zero-floored X (0 where log() is missing)
         Records where X is missing are deleted from current.
*/
%MACRO equival ;
	%LOCAL i tmpv ;
	%LET i = 1 ;
	%DO %WHILE (%LENGTH(%SCAN(&incTyp,&i)) > 0) ;
		%LET tmpv  = %SCAN(&incTyp,&i) ;
		/* zero-floored copy and its log */
		DATA current ;
		 SET current ;
			e&tmpv  = &tmpv ;
			IF e&tmpv=. THEN DELETE;
			IF (&tmpv<0)  THEN e&tmpv=0;
			log&tmpv=log(e&tmpv);
			IF( (log&tmpv=.)  AND (e&tmpv^=.) ) THEN log&tmpv=0;
		RUN;
		PROC SORT DATA=current ;
		  BY did log&tmpv;
		RUN ;
		/* weighted quartiles of the log variable, passed on as &b and &t */
		PROC UNIVARIATE DATA=current NOPRINT;
		  VAR log&tmpv ;
		  WEIGHT hwgt;
		  OUTPUT OUT=temp P25=q25 P75=q75;
		RUN ;
		DATA _NULL_;
		  SET temp;
		  CALL SYMPUT("b",q25);
		  CALL SYMPUT("t",q75);
		RUN;
		/* apply the bounds, then equivalise */
		DATA current ;
		 SET current ;
			iqr=&t-&b;
			upper_bound=&t + (iqr * 3) ;
			lower_bound=&b - (iqr * 3);
			IF e&tmpv>exp(upper_bound) THEN e&tmpv=exp(upper_bound) ;
			IF e&tmpv<exp(lower_bound) THEN e&tmpv=exp(lower_bound);
			e&tmpv = e&tmpv	/ SQRT(nhhmem) ;
		RUN ;
		%LET i = %EVAL(&i+1) ;
	%END ;
%MEND equival ;
 
/*
 Macro Gini: weighted Gini coefficient of one income variable.

 The macro has no parameters. It reads three macro variables that the
 caller sets before invoking %gini:
   dataset - data set to analyse (it is sorted in place by &var)
   var     - income variable
   wgt     - weight variable

 Result: data set Gini with one observation and one variable (gini),
 which is printed through PROC MEANS.
*/
%MACRO Gini ;
	/* cumulative shares are built in ascending order of income */
	PROC SORT DATA=&dataset ;
	  BY &var ;
	RUN ;
	DATA Gini (KEEP=gini) ;
	    /* First pass, executed once: total weight (swt) and total
	       weighted income (swtey) over the whole data set */
	    IF _N_ = 1 THEN
	        DO UNTIL (last) ;
	            SET &dataset END=last;
				swt + &wgt ;
				swtey + (&wgt*&var) ;
	        END ;
	    /* Second pass over the SAME data set as the first pass
	       (&dataset, not a fixed data set name) */
	    SET &dataset END=eof;
	        IF _N_ = 1 THEN
	            DO ;
	                prewt = 0 ;
					preey = 0 ;
					up    = 0 ;
					sum   = 0 ;
	            END ;
	        /* running totals and cumulative shares in percent */
	        cwt + &wgt ;
		   	cwtey + (&var*&wgt);
		   	pcwt   = cwt / swt * 100;
		   	pcwtey = cwtey / swtey * 100;
		   	/* width of the weight step times the sum of the current and
		   	   previous cumulative income share */
		   	up     = (pcwt-prewt) * (pcwtey+preey) ;
		   	sum + up ;
		   	prewt = pcwt ;
	       	preey = pcwtey ;

		   	RETAIN prewt preey ;
			/* both shares are in percent, hence the division by 100*100 */
			IF eof THEN
	            DO ;
	               gini=1-(sum / 10000) ;
	               OUTPUT ;
	            END ;
	RUN;
	PROC MEANS DATA=Gini MEAN ;
	RUN;
%MEND Gini ;

/*
 Macro indic: Gini coefficient and poverty rate for a list of income
 variables in data set current, with TITLE2/TITLE3 lines naming the
 variable and the indicator.

 Input : macro variable incTyp - blank-separated list of variable names.
         The list may have any length; the loop ends at the first empty
         word returned by %SCAN.
 Needs : macro Gini, and variables ipwgt and povlin in data set current.
 Output: printed Gini (via %gini) and a PROC FREQ table of poor<j> for the
         j-th listed variable; poor<j> (1 if the variable is below povlin,
         otherwise 0) is added to current.
*/
%MACRO indic ;
	%LOCAL j etmpv ;
	%LET j = 1 ;
	%DO %WHILE (%LENGTH(%SCAN(&incTyp,&j)) > 0) ;
		%LET etmpv = %SCAN(&incTyp,&j) ;
		TITLE2 "Variable  : &etmpv" ;
		TITLE3 "Indicator : Gini";
		/* macro Gini reads dataset, var and wgt */
		%LET dataset = current;
		%LET var     = &etmpv ;
		%LET wgt     = ipwgt  ;
		%gini
		TITLE3 "Indicator : Poverty rates";
		DATA current ;
		 SET current ;
		 	poor&j = 0 ;
			IF &etmpv < povlin THEN poor&j = 1 ;
		RUN ;
		PROC FREQ DATA=current;
		   TABLES poor&j ;
		   WEIGHT ipwgt;
		RUN ;
		%LET j = %EVAL(&j+1) ;
	%END ;
%MEND indic ;

/*
 Macro multi: run the complete analysis once per data set listed in the
 macro variable all.

 Input : macro variable all - blank-separated list of data set codes
         (e.g. gt06). For a code cc, the household file is taken from the
         macro variable named <cc>h. The list may have any length; the
         loop ends at the first empty word returned by %SCAN.
 Needs : macros equival and indic.
 Output: per data set, a frequency table of grossnet, a summary of the
         public transfer variables, and the Gini / poverty tables printed
         by %indic.
*/
%MACRO multi  ;
	%LOCAL k ccyy ;
	%LET k = 1 ;
	%DO %WHILE (%LENGTH(%SCAN(&all,&k)) > 0) ;
		%LET ccyy = %SCAN(&all,&k) ;
		TITLE1 "Country   : &ccyy" ;

		/* Working file: &&&ccyy.h resolves first to &<code>h and then to
		   the value of that macro variable. Records where any tested
		   income component is missing are dropped. */
		DATA current ;
		 SET &&&ccyy.h (KEEP=dhi hifactor hi33 hpublic hpub_i hpub_u hpub_a hiprivate hxitsc hwgt nhhmem grossnet did);
			miss_comp = 0 ;
			IF  (dhi=. | hpub_i=. | hpub_u=. | hpub_a=. | hi33=. | hiprivate=. | hxitsc=.) THEN miss_comp = 1 ;
			IF miss_comp = 1 THEN DELETE ;
			ipwgt =  hwgt*nhhmem ;
		RUN ;

		PROC FREQ DATA=current;
		  TABLES grossnet;
		RUN;

		PROC MEANS DATA=current;
		  VAR hpublic hpub_i hpub_u hpub_a;
		RUN;

		/* Income concepts */
		DATA current ;
		 SET current ;
			dhitb = dhi ;
			mi    = (sum(hifactor,hiprivate,hi33)) ;
			siti  = (sum(hifactor,hiprivate,hi33,hpub_i,hpub_u,-hxitsc)) ;
			sa    = (sum(hifactor,hiprivate,hi33,hpub_a)) ;
		RUN ;

		/* Top/bottom-code and equivalise. One call is enough: equival
		   rebuilds every e-variable from its unmodified source variable,
		   so a repeated call would only redo the same work. */
		%LET incTyp = mi siti sa dhitb ;
		%equival

		/* Poverty line: half the person-weighted median of edhitb */
		PROC MEANS DATA=current NOPRINT;
		  VAR edhitb ;
		  WEIGHT ipwgt ;
		  OUTPUT OUT=temp MEDIAN=mededhi;
		RUN ;
		DATA _NULL_;
		 SET temp;
		   CALL SYMPUT("m",mededhi);
		RUN;
		DATA current ;
		 SET current ;
			povlin = &m * 0.5 ;
		RUN ;

		%LET incTyp = emi esiti esa edhitb ;
		%indic

		%LET k = %EVAL(&k+1) ;
	%END ;

%MEND multi ;
/**------------------------------**/
/**  PART II: RUN THE ROUTINES   **/ 
/**------------------------------**/

/* Start from blank title lines 1 to 3. */
title1 "";
title2 "";
title3 "";

/* Data set codes to process, then run the analysis for each of them. */
%let all = gt06 us04 dk04 hu05 il05;
%multi
